This document will explore the implementation of various types of plots that can be produced with base R graphics and/or the ggplot2 package. For each plot type we begin by showing basic plots and then enhance the plots by combining plot types, adding new variables using faceting, color, shape or size, or customizing colors, axes, axis labels, annotations or styles.
library(tidyverse)
library(ggplot2)
library(vioplot)
library(ggmosaic)
library(scales) # provides alpha() function to set alpha for base R plots
library(hexbin)
library(GGally)
library(RColorBrewer)
library(viridis)
library(geojsonio)
library(gridExtra)
library(kableExtra) # For nice looking tables in html output
library(reshape) # For the melt function in heatmap example
library(lubridate)
library(geojsonio)
library(rgdal)
library(rgeos)
library(broom)
library(ggsoccer)
library(ggridges)
library(gganimate)
library(nhlapi)
library(quantmod)
# Load all datasets used throughout this document.
MM <- read.csv('data/MarchMadness.csv')        # March Madness game results
titanic <- read.csv("data/titanic.csv")        # Titanic passenger manifest
nba_players <- read_csv("data/nba_players.csv")  # readr version: returns a tibble
gender_spouse <- read.csv( file = "data/dataset_gender_spouse.csv")
steel <- read.csv("data/steel.csv")            # annual steel production by country
# Duplicate read of the Titanic data; `header` is now named explicitly
# (it was previously passed positionally as a bare TRUE).
data <- read.csv("data/titanic.csv", header = TRUE, sep = ",")
# Europe country boundaries as a SpatialPolygonsDataFrame (network access)
spdf <- geojson_read("https://raw.githubusercontent.com/leakyMirror/map-of-europe/master/GeoJSON/europe.geojson", what = "sp")
L <- read.csv('data/phone.csv')
# Hex-grid layout of US states for hexbin-style maps
spd <- geojson_read("data/us_states_hexgrid.geojson", what = "sp")
hockey <- read.csv("data/hockey_search.csv")
card <- read.csv('data/cards.csv')
Use the Iris dataset. Show distribution of petal width.
# Basic histogram of petal width across all iris observations.
hist(iris$Petal.Width,
     main = "Iris Petal Width Histogram",
     xlab = "Petal Width")

# Customized version: pink bars, 20 bins, explicit axis limits.
hist(iris$Petal.Width,
     breaks = 20,
     col = 'pink',
     xlim = c(0, 2.9),
     ylim = c(0, 40),
     main = "Iris Petal Width Histogram",
     xlab = "Petal Width")
# Subset SEPAL width by species (note: these are sepal widths, not petal).
set <- iris$Sepal.Width[iris$Species == "setosa"]
ver <- iris$Sepal.Width[iris$Species == "versicolor"]
vir <- iris$Sepal.Width[iris$Species == "virginica"]
# Stack the three histograms vertically for comparison.
par(mfrow = c(3, 1))
# Fixed: titles and x labels previously said "Petal Width" although the
# data are sepal widths; also use one common xlim (the third panel used
# c(1.5, 5.5)) so the panels are directly comparable.
hist(set, main = "Iris Setosa Sepal Width", xlab = "Sepal Width (cm)",
     ylab = "Frequency", col = 'orange', xlim = c(1.5, 6.5))
hist(ver, main = "Iris Versicolor Sepal Width", xlab = "Sepal Width (cm)",
     ylab = "Frequency", col = 'orange', xlim = c(1.5, 6.5))
hist(vir, main = "Iris Virginica Sepal Width", xlab = "Sepal Width (cm)",
     ylab = "Frequency", col = 'orange', xlim = c(1.5, 6.5))
# Minimal ggplot2 histogram of petal width (default binning).
ggplot(data = iris, mapping = aes(x = Petal.Width)) +
  geom_histogram()
Changing the bins, color, outline color, adding title, and adding x and y labels
# Customized histogram: 20 bins, orange fill with blue outlines.
ggplot(iris, aes(x = Petal.Width)) +
  geom_histogram(color = "Blue", fill = "orange", bins = 20) +
  labs(title = "Iris Petal Width Histogram",
       x = "Petal Width",
       y = "Frequency")

# Map fill to Species so each species gets its own stacked color.
ggplot(iris, aes(x = Petal.Width)) +
  geom_histogram(color = "yellow", aes(fill = Species), bins = 20) +
  labs(title = "Iris Petal Width Histogram",
       x = "Petal Width",
       y = "Frequency")

# One panel per species via facet_grid().
ggplot(iris, aes(x = Petal.Width)) +
  geom_histogram(color = "blue", fill = "orange", bins = 20) +
  labs(title = "Iris Petal Width Histogram",
       x = "Petal Width",
       y = "Frequency") +
  facet_grid(. ~ Species)
#this is for more color
# Reversed Spectral color ramp used to fill the hexbin plot.
# (The previous `r <- rf(32)` vector was never used anywhere and has been
# removed; plot() calls the ramp function itself via `colramp`.)
rf <- colorRampPalette(rev(brewer.pal(11,'Spectral')))
# Hexbin of point differential vs seed. MM is the March Madness data.
x <- MM$Pointdiff
y <- MM$Seed
dataframe <- data.frame(x, y)
h <- hexbin(dataframe)
plot(h, colramp = rf, xlab = "Point Differential", ylab = "Seed")
#this is for more color
# Reversed Spectral palette ramp, reused for this second hexbin plot.
rf <- colorRampPalette(rev(brewer.pal(11,'Spectral')))
#this is the actual setup of the plot
# Seed differential = absolute gap between the two teams' seeds.
MM <- MM %>%
mutate(Seeddiff = abs(Seed - Seed.1))
x <- MM$Seeddiff
y <- MM$Pointdiff
# 20 hexagon bins across the x range.
h<- hexbin(x, y, xbins = 20)
plot(h, colramp = rf,
xlab = "Seed Differential",
ylab = "Point Differential",
main = "Seed Differential vs Point Differential")
# Hexbin version in ggplot2 with a viridis fill scale.
# Fixed: a stray `geom_hex()` call was pasted immediately before ggplot(),
# which made this chunk a syntax error.
ggplot(MM, aes(x = Seeddiff, y = Pointdiff)) +
  geom_hex(bins = 30) +
  scale_fill_continuous(type = "viridis") +
  labs(x = "Seed Differential",
       y = "Point Differential",
       title = "Seed Differential vs Point Differential") +
  theme_bw()
# Fit a simple linear regression of point differential on seed
# differential, then round R^2 and the slope for the plot annotation.
fit <- lm(Pointdiff ~ Seeddiff, data = MM)
r2 <- round(summary(fit)$r.squared, 2)
slope <- round(summary(fit)$coefficients["Seeddiff","Estimate"], 2)
# Scatterplot with a least-squares line (no confidence band) and the fit
# statistics annotated in the upper-left corner.
ggplot(MM, aes(x=Seeddiff, y=Pointdiff)) +
geom_point(size = 3,
alpha = 0.20,
color = "steelblue") +
geom_smooth(method = lm,
se = FALSE,
linetype = "solid",
color = "red") +
labs(x = "Seed Differential",
y = "Point Differential",
title = "Higher Seed Differentials Are Associated with Higher Point Differentials") +
# substitute() splices the computed r2/slope into a plotmath expression;
# passing an expression to annotate() triggers the is.na() warning shown
# below but the label still renders correctly.
annotate("text", x = 1, y = 52,
label = as.expression(substitute(atop(R^2==r2, beta==slope),
list(r2=r2, slope=slope))),
vjust = "middle", hjust = "left") +
theme_bw()
## Warning in is.na(x): is.na() applied to non-(list or vector) of type
## 'expression'
# Average point differential for each seed differential, rendered as a
# styled HTML table with kableExtra.
MM %>%
group_by(Seeddiff) %>%
summarize(AvgPtDiff = round(mean(Pointdiff), 2)) %>%
kbl(align = "l") %>%
kable_styling()
| Seeddiff | AvgPtDiff |
|---|---|
| 0 | 8.69 |
| 1 | 9.22 |
| 2 | 11.27 |
| 3 | 9.43 |
| 4 | 9.52 |
| 5 | 10.20 |
| 6 | 7.89 |
| 7 | 9.97 |
| 8 | 12.01 |
| 9 | 11.16 |
| 10 | 11.38 |
| 11 | 13.10 |
| 12 | 13.20 |
| 13 | 17.07 |
| 15 | 24.90 |
Hmm… It seems like there isn’t much difference in average point differential for seed differentials ranging from 2 to 10. Let’s rerun the scatterplot and regression for that range of seed differentials.
# Restrict to seed differentials between 2 and 10 (inclusive) and refit.
MM_sub <- MM %>%
filter(between(Seeddiff, 2, 10))
fit <- lm(Pointdiff ~ Seeddiff, data = MM_sub)
r2 <- round(summary(fit)$r.squared, 2)
slope <- round(summary(fit)$coefficients["Seeddiff","Estimate"], 2)
# Same scatter-plus-regression layout as above, limited to the 2-10 range.
ggplot(MM_sub, aes(x=Seeddiff, y=Pointdiff)) +
geom_point(size = 3,
alpha = 0.20,
color = "steelblue") +
geom_smooth(method = lm,
se = FALSE,
linetype = "solid",
color = "red") +
labs(x = "Seed Differential",
y = "Point Differential",
title = "Moderate Seed Differentials Don't Seem Very Important") +
# Expression annotation again; the warning below is cosmetic.
annotate("text", x = 2, y = 45,
label = as.expression(substitute(atop(R^2==r2, beta==slope),
list(r2=r2, slope=slope))),
vjust = "middle", hjust = "left") +
scale_x_continuous(breaks = 2:10, limits = c(2, 10)) +
theme_bw()
## Warning in is.na(x): is.na() applied to non-(list or vector) of type
## 'expression'
# The built-in Orange dataset tracks the growth of orange trees.
orange <- Orange

# Kernel density estimate of trunk circumference, drawn and then shaded
# with polygon(); col = 7 and border = 2 index the default palette.
dens <- density(orange$circumference)
plot(dens,
     main = "CIRCUMFERENCE OF ORANGE TREES",
     xlab = "CIRCUMFERENCE",
     ylab = "DISTRIBUTION")
polygon(dens, col = 7, border = 2)
# ggplot2 version of the same density, shaded with the same numeric colors.
ggplot(orange, aes(x = circumference)) +
  geom_density(fill = 7, color = 2)
Use the same histogram created in the “Customized Histogram Using Base R” section and overlay a density curve on it.
# Reuse the customized petal-width histogram, this time on a density scale
# (freq = FALSE) so a kernel density curve can be overlaid.
x <- iris$Petal.Width
h <- hist(x, breaks = 20, col = "pink", freq = FALSE,
          xlab = "Petal Width", main = "Histogram With Density Curve")
lines(density(x), col = "blue", lwd = 2)
# Price density for each diamond cut, one facet per cut; adjust = 1.5
# widens the kernel bandwidth for a smoother curve.
ggplot(diamonds, aes(x = price, fill = cut)) +
  geom_density(adjust = 1.5) +
  facet_wrap(~ cut) +
  theme(legend.position = "none")
This is an example taken directly from the R-Graph-Gallery to show the diamonds dataset used in the plot above.
# Ridgeline plot of diamond price by cut. (ggridges was already attached
# at the top of the file; this repeated library() call is harmless.)
library(ggridges)
ggplot(diamonds, aes(x = price, y = cut, fill = cut)) +
geom_density_ridges() +
theme_ridges() +
theme(legend.position = "none")
## Picking joint bandwidth of 458
This ridgeline plot shows the distributions of chick weights when fed different diet types.
# Ridgeline comparison of chick weights across the four diet types.
ggplot(ChickWeight, aes(x = weight, y = Diet, fill = Diet)) +
  geom_density_ridges() +
  theme_ridges() +
  theme(legend.position = "none") +
  labs(title = "Effects of Diet on Chick Weight",
       x = "Chick Weight",
       y = "Chick Diet Type")
## Picking joint bandwidth of 24
### Base R
### A small named vector of survey counts to work with
survey <- c(apple = 45, orange = 25, honeydew = 35, banana = 50,
            watermelon = 20, blueberry = 15)
### One color per fruit, shared by the bars and the legend
fruit_colors <- c("red", "orange", "green", "yellow", "pink", "blue")

### Bar chart with a title, axis labels, and per-bar colors
barplot(survey,
        main = "Favorite Fruit Survey",
        xlab = "Fruit",
        ylab = "Count",
        col = fruit_colors)

### Legend in the top-right corner; bty = "n" suppresses the legend box
legend("topright",
       legend = names(survey),
       col = fruit_colors,
       pch = 19,
       bty = "n",
       pt.cex = 2,
       cex = 1.2)
# Use the same data set created in the base r example but make it into a data frame
# Same survey data as the base R example, as a data frame for ggplot2.
survey1 <- data.frame(
  fruit = c("apple", "orange", "honeydew", "banana", "watermelon", "blueberry"),
  value = c(45, 25, 35, 50, 20, 15))
# Basic bar chart (stat = "identity" plots the values as given).
ggplot(survey1, aes(x = fruit, y = value)) + geom_bar(stat = "identity")
# Colored bars with black outlines.
# Fixed: the x and y axis labels were swapped (x shows the fruit, y the count).
ggplot(survey1, aes(x = fruit, y = value)) +
  geom_bar(stat = "identity",
           fill = c("red", "orange", "green", "yellow", "pink", "blue"),
           color = "black") +
  labs(x = "Fruit", y = "Count", title = "Fruit Survey") +
  theme(legend.position = "none")
First, calculate survival percentage and class count for each passenger class, using the aggregate() function, and then merge those two results into a data frame from the titanic dataset titanic.
# calculate survival percentage by class and count by class.
# Survived is 0/1, so its mean is the survival rate and its length is the
# passenger count in each class.
surv_pct <- aggregate(Survived ~ Pclass, data = titanic, FUN = mean)
class_ct <- aggregate(Survived ~ Pclass, data = titanic, FUN = length)
# Merge those two results into a single data frame, and specify
# column names
by_class_data <- merge(surv_pct, class_ct,
by = "Pclass")
colnames(by_class_data) <- c("Pclass", "Surv_Pct", "Count")
#str(by_class_data)
Create the barplot. Note that we increase the ylim parameter a bit to leave room for labels above the bars. We also set axes = FALSE so that the default y-axis is not plotted. Note also that we assign the barplot object to a variable. What gets assigned to the variable is the x positions of the midpoints of each of the bars on the plot. We will use those x positions later to position the labels above the bars, since the x position for a text label specifies the center of the label.
In the next step we create a custom y-axis with the axis() function. The at parameter specifies the placement of the tick marks on the axis (“axis ticks”) and the labels parameter specifies the tick labels. Use the paste() function to make percentage labels.
Finally, we create the labels showing the count of passengers in each class and place them right above the bars. The x position comes from the barplot object and the y position is just the height of the bars plus a little extra so that the text labels don’t overlap the bars.
# Bar chart of survival rate with a custom percent y-axis and "n:" labels.
# barplot() returns the x midpoints of the bars, kept in `plt` to position
# the labels; ylim is padded and axes = FALSE suppresses the default axis.
plt <- barplot(by_class_data$Surv_Pct,
names.arg = paste("Class", by_class_data$Pclass),
col = 'steelblue',
main = "Survival Percentage by Passenger Class",
ylab = "Survival Percentage",
xlab = "Passenger Class",
ylim = c(0.0, 0.75),
axes = FALSE
)
# Add y axis
# Percent tick labels every 10 points; las = 1 keeps them horizontal.
axis(2, at = seq(0.0, 0.7, by = 0.1),
labels = paste(seq(0, 70, by = 10), "%", sep=""),
las = 1,
cex.axis = 0.9)
# Add the text of the labels
# y = bar height plus a small offset so labels sit just above the bars.
text(x = plt,
y = by_class_data$Surv_Pct + 0.04,
labels = paste("n: ", by_class_data$Count, sep=""),
cex=1)
First, calculate survival percentage and class count for each passenger class using titanic dataset using tidyverse approach.
# Tidyverse equivalent of the aggregate()/merge() above: passenger count
# and survival rate per class in a single summarize().
survival <- titanic %>%
group_by(Pclass) %>%
summarize(Count = n(),
Surv_Pct = mean(Survived))
Make the plot.
# Column chart of survival rate with an "n: <count>" label above each bar.
ggplot(survival, aes(x = Pclass, y = Surv_Pct)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = paste0("n: ", Count)), vjust = -0.5) +
  labs(title = "Survival Percentage by Passenger Class",
       x = "Passenger Class",
       y = "Survival Percentage") +
  scale_y_continuous(limits = c(0, 0.70),
                     breaks = seq(0, .7, by = 0.10),
                     labels = paste0(seq(0, 70, by = 10), "%")) +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) # center the title
# Four columns of 100 standard-normal draws; colMeans() is the idiomatic
# (and faster) replacement for apply(x, 2, mean).
x <- replicate(4, rnorm(100))
colMeans(x)
## [1] 0.06270582 0.09459917 -0.08199359 0.15145202
# Grouped (side-by-side) bar chart of a 2x2 matrix.
# Fixed: `nr=` relied on partial argument matching and `T` on a
# reassignable alias; both are now spelled out.
barplot(matrix(c(5, 3, 8, 9), nrow = 2), beside = TRUE,
        col = c("aquamarine3", "coral"),
        names.arg = LETTERS[1:2])
legend("topleft", c("A", "B"), pch = 15,
       col = c("aquamarine3", "coral"),
       bty = "n")
Using gender_spouse dataset.
# Column names of the gender/spouse dataset.
names(gender_spouse)
## [1] "gender" "spouse" "total"
# check the number of students of each gender
##side-by-side bars
# position = "dodge" places the spouse categories next to each other
# instead of stacking them.
ggplot(gender_spouse, aes(y=total, x=gender, fill=spouse))+
geom_bar(stat="identity" ,position = "dodge")
##Creating a general circular barplot
##Creating a data frame
# NOTE(review): this overwrites the global `data` (the Titanic copy read
# at the top of the file).
data <- data.frame(
colors=c("red", "orange", "yellow", "green", "blue", "purple", "pink", "black", "white"),
value=c(50, 35, 80, 60, 75, 65, 55, 90, 10)
)
##Making the plot
p <- ggplot(data, aes(x=as.factor(colors), y=value)) +
##Numeric value in alpha changes how bright or vibrant the color in the graph appears
geom_bar(stat="identity", fill=alpha("red", 0.5)) +
##Adding limits
# The negative lower limit leaves a hole in the middle of the circle;
# blank axes/grid and negative margins remove all chart furniture.
ylim(-120, 150) + theme_minimal() + theme(
axis.text = element_blank(),
axis.title = element_blank(),
panel.grid = element_blank(),
plot.margin = unit(rep(-2, 4), "cm")) +
##Making the coordinate polar instead of cartesian
coord_polar(start = 0)
p
# Circular barplot with rotated labels around the rim.
# NOTE(review): sample() is called without set.seed(), so the bar heights
# differ on every render.
data <- data.frame(
id=seq(1, 9),
individual=paste( c("red ", "orange ", "yellow ",
"green ", "blue ", "purple ",
"pink ", "black ", "white "), seq(1,9), sep=""),
value=sample( seq(10, 100), 9, replace=T)
)
##Prepping the data to create labels for the graph
# Get the name and the y position of each label
label_data <- data
# calculating the angles on the label
number_of_bar <- nrow(label_data)
angle <- 90 - 360 * (label_data$id-0.5) /number_of_bar # 0.5 is subtracted because the text needs to be on the angle of the center of the bars.
# calculate the alignment of labels: right or left
# If I am on the left part of the plot, my labels have currently an angle < -90
label_data$hjust<-ifelse( angle < -90, 1, 0)
# flip angle BY to make them readable
label_data$angle<-ifelse(angle < -90, angle+180, angle)
# ----- ------------------------------------------- ---- #
# Creating the plot
p <- ggplot(data, aes(x=as.factor(id), y=value)) +
geom_bar(stat="identity", fill=alpha("red", 0.5)) +
ylim(-100,120) +
theme_minimal() +
theme(
axis.text = element_blank(),
axis.title = element_blank(),
panel.grid = element_blank(),
plot.margin = unit(rep(-1,4), "cm")
) +
coord_polar(start = 0) +
# Add the labels, using the label_data dataframe that we have created before
geom_text(data=label_data, aes(x=id, y=value+10, label=individual, hjust=hjust), color="black", fontface="bold",alpha=0.6, size=2.5, angle= label_data$angle, inherit.aes = FALSE )
p
This time series shows the water level in Lake Huron from 1875 to 1972.
The original dataset from R is a time series, and in order to plot it I had to transform to a dataframe and then plot using that instead.
# LakeHuron is a ts object; build a data frame with an explicit year
# column so ggplot maps variables from the data, not from the global
# environment. (Previously aes(y = LakeHuron) picked up the global ts
# object, which triggered the "Don't know how to automatically pick
# scale for object of type ts" warning.)
lakehuron <- data.frame(year = 1875:1972,
                        level = as.numeric(LakeHuron))
ggplot(lakehuron, aes(x = year, y = level)) +
  geom_line() +
  ggtitle("WATER LEVELS IN LAKE HURON") +
  xlab("") +
  ylab("WATER LEVEL")
The first step is to get the data and create the conditional variable(s). For this plot I will use discharge data from the Youghiogheny River to visualize flood events. What even is a flood? There are many ways that flooding events can be quantified, but among the simplest are to see when discharge exceeds 3 & 7 times the monthly median. Here is how to process the data for that.
# Download and prepare Youghiogheny River discharge data from USGS
# (requires network access), then flag flood events at 3x and 7x the
# monthly median discharge.
# The URL used to query the USGS database
# includes parameters including data type, site number, and dates
URL = 'https://waterdata.usgs.gov/usa/nwis/dv?cb_00060=on&format=rdb&site_no=03075500&referred_module=sw&period=&begin_date=2018-01-01&end_date=2020-12-31'
# data is displayed with some extra comments so it takes a little additional processing to clean up
d = read.delim(URL, comment.char = '#', stringsAsFactors = F)
# Drop the RDB format row that sits directly under the header.
d=d[-1,]
d$datetime=as.POSIXct(d$datetime)
colnames(d)[4]='discharge'
d$discharge = as.numeric(d$discharge)
# creating month & year variables for aggregation
d$month=month(d$datetime)
d$year=year(d$datetime)
# creating a data frame of median discharges for each year & then upper thresholds as described by Olden & Poff (2003)
# aggregating for median using unique months & years
aggDischarge=aggregate(data=d, discharge~year+month, FUN=median, na.rm=TRUE)
colnames(aggDischarge)[3]='monthly.median' #rename column
# merging the calculated medians back into the main data frame. all.x=T prevents any of the original data from being removed by allowing monthly medians to be copied as many times necessary
d=merge(d, aggDischarge, all.x=T)
#identify flood events, upper thresholds are defined as 3 and 7 times monthly median discharge
d$flood.threshold3=d$monthly.median*3
d$flood.threshold7=d$monthly.median*7
# creating a factor variable for each reading indicating if a flood event threshold is exceeded
d$flood.event3=ifelse(d$discharge>d$flood.threshold3,1,0)
d$flood.event7=ifelse(d$discharge>d$flood.threshold7,1,0)
# if discharge surpasses 7 medians, then logically it will be above 3 medians as well. To create a single variable with three factor levels (non-flood, 3, & 7), we can simply add the flood.event columns together to create values of 0, 1, & 2
d$flood.event=as.factor(d$flood.event3+d$flood.event7)
There are two important aesthetics to set to get this to work: 1. Set color = flood.event. This tells R to use the variable to change the line color by the associated value.
# 2. group = 1: by default, mapping color to a factor splits the data into
#    one line per level; group = 1 keeps it as a single connected line.
#    (Fixed: this explanatory sentence had been pasted directly onto the
#    ggplot() call, making the chunk a syntax error.)
ggplot(d, aes(x = datetime, y = discharge,
              color = flood.event, group = 1)) +
  labs(title = 'Flood Events of the Youghiogheny River near Oakland, MD',
       x = 'year', y = 'discharge (cu. ft/sec)',
       color = 'Flood Event') +
  geom_line(size = 0.75) +
  # customize the line colors and legend labels with scale_color_manual()
  scale_color_manual(values = c('black', 'orange', 'red'),
                     labels = c('non-flood', 'moderate', 'heavy')) +
  scale_y_continuous(expand = c(0, 0)) + # removing some empty space between the x axis and title
  theme_classic()
The `steel` data is the total steel produced annually by the 5 largest steel-producing countries. Make the plot. geom_area() is the function that produces the stacked area plot.
# Stacked area chart of annual steel production by country.
ggplot(steel, aes(x = year, y = Steel, fill = Country)) +
  geom_area() +
  labs(title = "Annual Steel Produced, Top 5 Countries (Million Tons)",
       x = "Year",
       y = "Steel Produced (Million Tons)")

# Proportional (100%) stacked bars: each year's bars sum to one.
ggplot(steel, aes(x = year, y = Steel, fill = Country)) +
  geom_bar(position = "fill", stat = "identity") +
  labs(title = "Share of Steel Produced Annually",
       x = "Year",
       y = "% of Steel Produced")
In the Boxplot section add two basic boxplot examples, one utilizing base R and one utilizing ggplot2. Note that even basic examples should have appropriate titles and axis labels.
# Some data to work with: weight measurements of chicks over time.
data(ChickWeight)
# Base R boxplot: one box per measurement age.
boxplot(ChickWeight$weight ~ ChickWeight$Time,
        main = "Chick Age versus Chick Weight",
        xlab = "Chick Age in Days",
        ylab = "Chick Weight in Grams")
# ggplot2 version.
# Fixed: the original mapped only group = Time with no x aesthetic, so the
# x axis carried no age information despite the "Chick Age in Days" label;
# mapping x = Time places each box at its age.
p <- ggplot(ChickWeight, aes(x = Time, group = Time, y = weight)) +
  geom_boxplot() +
  labs(title = "Chick Age versus Chick Weight",
       x = "Chick Age in Days",
       y = " Chick Weight in Grams") +
  theme_classic()
p
Using Jitter, you can display the individual data points in each box in a way that is easy to see and understand.
This example is based on one found at the R Graph Gallery website: https://www.r-graph-gallery.com/89-box-and-scatter-plot-with-ggplot2.html
data("ChickWeight")
# Inspect the grouped-data structure of the dataset.
str(ChickWeight)
## Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame': 578 obs. of 4 variables:
## $ weight: num 42 51 59 64 76 93 106 125 149 171 ...
## $ Time : num 0 2 4 6 8 10 12 14 16 18 ...
## $ Chick : Ord.factor w/ 50 levels "18"<"16"<"15"<..: 15 15 15 15 15 15 15 15 15 15 ...
## $ Diet : Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "formula")=Class 'formula' language weight ~ Time | Chick
## .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv>
## - attr(*, "outer")=Class 'formula' language ~Diet
## .. ..- attr(*, ".Environment")=<environment: R_EmptyEnv>
## - attr(*, "labels")=List of 2
## ..$ x: chr "Time"
## ..$ y: chr "Body weight"
## - attr(*, "units")=List of 2
## ..$ x: chr "(days)"
## ..$ y: chr "(gm)"
# Boxplots of weight by diet with viridis fills and the raw observations
# jittered on top in red.
ChickWeight %>%
ggplot( aes(x=Diet, y=weight, fill=Diet)) +
geom_boxplot() +
scale_fill_viridis(discrete = TRUE, alpha=0.6) +
geom_jitter(color= "red", size=0.6, alpha=0.9) +
theme(
legend.position="none",
plot.title = element_text(size=14)
) +
ggtitle("Chick Weight by Diet") +
ylab("Chick Weight in Grams") +
xlab("Chick Diet")
Pulling data from OKC, UTA, and CHI from nba_players dataset
# Subset three teams' points from the NBA data.
team <- nba_players$TEAM[nba_players$TEAM %in% c("OKC", "UTA", "CHI")]
pts <- nba_players$PTS[nba_players$TEAM %in% c("OKC", "UTA", "CHI")]
# NOTE: this overwrites the global `data` variable from earlier chunks.
data <- data.frame(team, pts)
# Violin plot of points for each team.
# Fixed: the border colors were drawn with sample(200:900, 3), so the plot
# changed on every render; fixed palette indices make it reproducible.
with(data , vioplot(
pts[team=="OKC"], pts[team=="UTA"], pts[team=="CHI"],
col= c(255,234,268) , names=c("OKC","UTA","CHI"),
xlab = "Teams",
ylab = "Points",
main = "NBA Points Per Team",
ylim = c(0,27),
border = c(300, 450, 600),
rectCol = c(893)
))
I did add a boxplot on top of the violin plot to make it resemble the base r graph.
# Violin plots with a narrow boxplot overlay, mirroring the base R version.
ggplot(data, aes(x = team, y = pts, fill = team)) +
  geom_violin(trim = FALSE, show.legend = TRUE) +
  geom_boxplot(width = 0.1)
The first example was created using https://r-charts.com/distribution/violin-plot-points-ggplot2/.
# ToothGrowth: guinea pig tooth length vs vitamin C dose and delivery.
df <- ToothGrowth
# dose is numeric in the raw data; convert it to a factor so it serves as
# the categorical axis variable.
df$dose <- as.factor(df$dose)
head(df)
## len supp dose
## 1 4.2 VC 0.5
## 2 11.5 VC 0.5
## 3 7.3 VC 0.5
## 4 5.8 VC 0.5
## 5 6.4 VC 0.5
## 6 10.0 VC 0.5
# Basic violin: tooth length by dose.
ggplot(df, aes(x = dose, y = len)) +
  geom_violin()
# Fill each violin by its dose level.
ggplot(df, aes(x = dose, y = len, fill = dose)) +
  geom_violin()
# Overlay the raw observations with reproducible horizontal jitter.
ggplot(df, aes(x = dose, y = len, fill = dose)) +
  geom_violin() +
  geom_point(position = position_jitter(seed = 1, width = 0.2))
The geom_point layer is used to add the points to the violin plot. It is always recommended to add some jitter to those points with the position_jitter argument. The seed value fixes the random jitter so the plot is reproducible, while the width controls the amount of horizontal jitter applied to each point.
In this plot, we will see the type of vitamin c, either the vitamin itself, or orange juice versus the tooth length.
# Delivery method (orange juice vs ascorbic acid) against tooth length.
ggplot(df, aes(x = supp, y = len, fill = supp)) +
  geom_violin() +
  geom_jitter(position = position_jitter(seed = 1, width = 0.2))
Boxplots are often used on top of other plots to show descriptive statistics.
# Narrow boxplots drawn over the violins summarize the quartiles.
ggplot(df, aes(x = supp, y = len, fill = supp)) +
  geom_violin(show.legend = FALSE) +
  geom_boxplot(width = 0.2)
Here we can use the alpha() function from the scales package to set an alpha (transparency) level for the color, ranging from 0 (fully transparent) to 1 (fully opaque). method = 'jitter' adds random noise in the y-dimension, and the jitter() function applied to the variable being plotted adds random noise to the actual values of the variable. amount = 0 causes that random noise to be z/50 where z is the range of the variable.
# One-dimensional strip chart of sepal width. jitter() perturbs the data
# values themselves while method = 'jitter' spreads the points vertically;
# alpha() (scales pkg) makes the overplotted points translucent.
stripchart(jitter(iris$Sepal.Width, amount = 0),
           method = 'jitter',
           pch = 19,
           col = alpha("blue", 0.5),
           main = "Sepal Width Distribution",
           xlab = "Sepal Width (cm)")
With ggplot2’s geom_jitter() is a shortcut for geom_point(position = "jitter"). Its width and height parameters control the amount of random noise in the x and y dimensions, respectively. With ggplot2 we can set alpha directly without the use of any helper functions. However, with ggplot2 it is a bit tricky to make a strip chart for just one variable (without a second variable for grouping), as we need to set the y variable, set axis limits, and suppress the axis ticks and labels for the unused axis.
# ggplot2 strip chart for a single variable: pin y to 0 and hide the
# unused y axis with empty breaks/labels.
ggplot(data = iris) +
  geom_jitter(aes(x = Sepal.Width, y = 0),
              width = 0.05,
              height = 0.2,
              color = "blue",
              alpha = 0.5,
              pch = 19,
              cex = 2) +
  scale_y_continuous(limits = c(-1, 1), breaks = NULL, labels = NULL) +
  labs(title = "Sepal Width Distribution",
       x = "Sepal Width (cm)",
       y = NULL)
# NOTE(review): palette() changes the session-wide default palette, so the
# Dark2 colors remain in effect for later base graphics as well.
palette(brewer.pal(n = 3, name = "Dark2"))
# Vertical jittered strip chart of age per class; col = 1:3 indexes the
# palette just set above.
stripchart(Age ~ Pclass, data = titanic,
vertical = TRUE,
method = "jitter",
pch = 1,
col = 1:3,
main = "Titanic Passenger Age by Class",
xlab = "Class")
Here is an example where the ggplot2 plot takes more work than the base R plot. Need to change Pclass to a factor or manipulate the x-axis scale to make sure it only shows 1, 2, and 3. As long as we are changing Pclass to a factor anyway let’s make the labels for the classes “First”, “Second”, and “Third”.
# Recode Pclass 1/2/3 as a factor with readable labels.
# NOTE(review): this overwrites titanic globally; later chunks (tables and
# mosaic plots below) depend on these labels.
titanic <- titanic %>%
mutate(Pclass = factor(Pclass, labels = c("First", "Second", "Third")))
Used the scale_color_brewer() function to set this plot to the same RColorBrewer palette as the base R example.
# Jittered ages per class, colored with the same Dark2 palette the base R
# example set via palette().
ggplot(data = titanic, aes(x = Pclass, y = Age, color = Pclass)) +
  geom_jitter(pch = 1, cex = 2, width = 0.12) +
  scale_color_brewer(palette = "Dark2") +
  theme_bw() +
  theme(legend.position = "none") +
  ggtitle("Titanic Passenger Age by Class") +
  xlab("Class") +
  ylab("Age")
This is fairly easy to do with ggplot2, as geom_jitter() by default takes two variables as parameters. One can be the categorical group, iris species.
# Horizontal strip plot of sepal width per species, with translucent
# boxplots layered underneath for the quartiles.
# (Overlay technique: https://www.youtube.com/watch?v=mbvU8fF4eGM&t=613s)
ggplot(data = iris) +
  geom_boxplot(aes(x = Sepal.Width, y = Species, fill = Species),
               alpha = 0.25,
               width = 0.5,
               show.legend = FALSE) +
  geom_jitter(aes(x = Sepal.Width, y = Species, color = Species),
              width = 0.05,
              height = 0.2,
              pch = 19,
              cex = 2) +
  labs(title = "Sepal Width Distribution by Iris Species",
       x = "Sepal Width (cm)",
       y = NULL) +
  theme_bw() +
  theme(legend.position = "none")
Strip charts are particularly useful with small datasets. The NBA data has a small number of players per team, so let’s try to use strip charts to compare the distribution of scoring on NBA teams, similar to the comparisons in the violin plot section.
# Plain (unjittered) strip chart of points per player for three teams.
nba_players %>%
  filter(TEAM %in% c("OKC", "UTA", "CHI")) %>%
  ggplot(aes(x = PTS, y = TEAM)) +
  geom_point() +
  labs(title = "Scoring Distribution on Selected NBA Teams")
# Top 10 scorers per team: rank rows by points within each team and keep
# the first ten.
top_ten <- nba_players %>%
filter(TEAM %in% c("OKC", "UTA", "CHI", "WAS")) %>%
group_by(TEAM) %>%
arrange(desc(PTS), .by_group = TRUE) %>%
mutate(rn = row_number()) %>%
filter(rn <= 10)
# The single leading scorer per team, used only for the text labels.
top_one <- nba_players %>%
filter(TEAM %in% c("OKC", "UTA", "CHI", "WAS")) %>%
group_by(TEAM) %>%
filter(row_number(desc(PTS)) == 1)
# Strip chart of the top-10 scorers; each team leader is labeled
# vertically, and a fully transparent dotted boxplot is layered on top.
ggplot(data = top_ten, aes(x =PTS, y = TEAM)) +
geom_point() +
scale_x_continuous(breaks = seq(5, 30, 5)) +
geom_text(aes(label = PLAYER),
data = top_one,
nudge_x = 0.25,
angle = 90,
alpha = 0.5,
vjust = "top",
hjust = "middle") +
labs(title = "Scoring Distribution of Top 10 Scorers") +
theme_bw() +
theme(panel.grid.major.y = element_blank()) +
# alpha = 0 hides the fill; linetype = 3 draws dotted outlines only.
geom_boxplot(alpha = 0,
linetype = 3)
This example shows a strip plot to compare chick weights with different types of feed.
# In ggplot2: strip plot of chick weight for each feed type (chickwts is
# a built-in dataset).
ggplot(data = chickwts) +
  geom_jitter(aes(x = weight, y = feed, color = feed),
              width = 0.05,
              height = 0.2,
              alpha = 0.5,
              pch = 19,
              cex = 2) +
  labs(title = "Chick Weight by Feed Type",
       x = "Chick Weight in Grams",
       y = "Feed Type") +
  theme_bw() +
  theme(legend.position = "none")
A second example, this time using the Carbon Dioxide Uptake in Grass Plants dataset
# Strip plot of CO2 uptake for chilled vs nonchilled grass plants.
ggplot(data = CO2) +
  geom_jitter(aes(x = uptake, y = Treatment, color = Treatment),
              width = 0.05,
              height = 0.2,
              alpha = 0.5,
              pch = 19,
              cex = 2) +
  labs(title = "Carbon Dioxide Uptake in Grass Plants by Treatment Type",
       x = "Carbon Dioxide Uptake",
       y = "Treatment Type") +
  theme_bw() +
  theme(legend.position = "none")
A quantile-quantile (“QQ”) plot is used to compare an empirical distribution to a theoretical distribution by plotting the observed quantiles vs the theoretical quantiles. They are typically used to evaluate whether or not a variable is normally distributed.
The built-in rivers dataset gives the lengths (in miles) of 141 rivers in North America, as compiled by the US Geological Survey. Let’s see if the river lengths are normally distributed by creating a normal probability plot of the river lengths.
Here we create a basic QQ plot in base R and add the normal distribution QQ plot as a reference line for comparison.
# Normal Q-Q plot of river lengths, with a reference line through the
# first and third quartiles for comparison against normality.
qqnorm(rivers,
       main = "Q-Q Plot of North American River Lengths")
qqline(rivers, col = "red")
Here we produce a similar plot using ggplot2. First we need to put the river lengths in a data frame.
# ggplot2 needs a data frame, so wrap the rivers vector in one.
rivers_df <- data.frame(length = rivers)
Now we make the plot. Note that the shape and size of the geom are revised to better match the base R plot.
# Q-Q plot via stat_qq(); open circles (shape = 1) echo the base R look.
ggplot(rivers_df, aes(sample = length)) +
  stat_qq(shape = 1, cex = 2) +
  stat_qq_line(color = "red") +
  ggtitle("Q-Q Plot of North American River Lengths") +
  xlab("Theoretical Quantiles") +
  ylab("Sample Quantiles") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) # center the title
First, we will mosaic plot of Titanic ticket class vs embarkment location using titanic dataset. First check to see if there is any missing data.
# Class counts (labels come from the factor recoding earlier).
table(titanic$Pclass)
##
## First Second Third
## 186 173 355
# Embarkation port counts; the extra first count (2) is the blank level,
# i.e. passengers with no recorded port.
table(titanic$Embarked)
##
## C Q S
## 2 130 28 554
Notice that there are two cases for which Embarked is blank. We will remove those cases.
# Keep only rows with a recorded embarkation port (two rows are blank).
mask <- titanic$Embarked %in% c("C", "Q", "S")
embarked <- titanic$Embarked[mask]
# NOTE(review): `class` shadows base::class() for the rest of the session.
class <- titanic$Pclass[mask]
Make the mosaic plot.
# Mosaic of embarkation site within each class; col = c(2, 3, 4) indexes
# the current palette.
mosaicplot(table(class, embarked),
           main = "Passenger Class by Embarkment Site",
           xlab = "Class", ylab = "Embarkment Site",
           col = c(2, 3, 4))
Let’s make a similar mosaic plot using ggplot2. We need to use the ggmosaic package to do so.
# ggmosaic version of the same plot.
# Fixed: the mutate previously called factor(embarked) — the lowercase
# GLOBAL vector created for the base R plot — instead of the Embarked
# column; it only worked because the same rows happened to be kept.
titanic_clean <- titanic %>%
  filter(Embarked %in% c("C", "Q", "S")) %>% # Removes cases with missing data
  mutate(
    Embarked = factor(Embarked)
  )
# Cell counts feeding the mosaic weights.
titanic_m_data <- titanic_clean %>%
  count(Embarked, Pclass)
ggplot(data = titanic_m_data) +
  geom_mosaic(aes(x = product(Pclass), fill = Embarked, weight = n)) +
  xlab("Passenger Class") +
  ylab("Embarkment Site") +
  ggtitle("Passenger Class by Embarkment Site")
Creating a scatterplot in base R is simple. For this example, I used the dataset pennsylvania-history, which shows the history of pennsylvania’s recorded covid cases from January to March 7. The variables of interest are positive, the number of positive cases, and deaths. Did the death count rise as cases grew?
First, we read the data file, pennsylvania-history.csv, and store it as covidchart. Our x values in the plot function are covidchart$positive, and our y values are covidchart$death. Our title, as given in main, is “Covid Infection and Fatalities.” Our x-axis is labeled Infections, for covid infections, and our y-axis is labeled Fatalities. Note that these fatalities are not explicitly confirmed to be from Covid, but are the overall fatality count in PA over time, each point representing a different count of infections and fatalities per day. We then draw a regression line based on positive cases and fatalities. The regression line is colored red and the points are colored grey, after the color scheme of the virus itself.
# PA COVID history: drop rows missing either count, then plot fatalities
# against positive cases with a fitted least-squares line.
covidchart <- read.csv("data/pennsylvania-history.csv")
covidchart <- subset(covidchart, !is.na(positive) & !is.na(death))
plot(covidchart$positive, covidchart$death,
     main = "Covid Infection and Fatalities",
     xlab = "Infections", ylab = "Fatalities",
     col = "grey")
abline(lm(death ~ positive, data = covidchart), col = "red")
We can see that, yes, positive cases unfortunately have a positive correlation with fatalities.
Another way to color your plot is by using rcolorbrewer. We first have to load rcolorbrewer. Note that we will be using an example barplot instead of a scatterplot.
# Dummy bar-chart data: nine labeled columns with counts 1 through 9.
dummy <- data.frame(
  colno = paste0("col", 1:9),
  colcount = as.numeric(1:9)
)
Then, execute this to see all the available palettes.
# Show swatches of every RColorBrewer palette.
display.brewer.all()
To view a single palette, type
# Illustrative call signature: n and name are placeholders, not defined here.
display.brewer.pal(n, name)
name denotes the palette you want to display, while n denotes how many colors are in that palette. For example, the following code shows you the first four colors in the Pastel1 palette.
# First four colors of the Pastel1 palette.
display.brewer.pal(4, "Pastel1")
# Illustrative signature: returns hex codes of n colors from the palette.
brewer.pal(n, name)
returns the hex codes of the n colors in the named palette as a vector. Here we create a palette of all 9 colors in Pastel1 palette.
# All 9 Pastel1 hex codes as a character vector.
Past <- brewer.pal(9, "Pastel1")
From there, go back to your plot’s color argument and use the following code to set the colors of the bars to the first color in the “Past” palette we made with RColorBrewer in the code chunk above.
# All bars share the first Pastel1 color.
barplot(height = dummy$colcount,
        names = dummy$colno,
        xlab = "colors", ylab = "count",
        ylim = c(0, 9),
        col = Past[1])
Alternatively, you could use the following code use every color in the palette to color the bars.
# One palette color per bar (the full Past vector is recycled across bars).
barplot(height = dummy$colcount,
        names = dummy$colno,
        xlab = "colors", ylab = "count",
        ylim = c(0, 9),
        col = Past)
ggplot2 is much the same. we set covidchart as the source of our data and positive and death as our x and y values in aes(). geom_point() is called to make ggplot graph this as a scatterplot. The labs() function labels our x and y axes, and the graph overall. To add a regression line, we use geom_smooth(method = lm).
# ggplot2 scatter of the COVID data with a regression line from geom_smooth.
ggplot(data = covidchart, aes(positive, death)) +
  geom_point(colour = "red") +
  labs(x = "Infections", y = "Fatalities",
       title = "Covid Infection and Fatalities") +
  geom_smooth(method = lm)
# Scatter of player age vs points with 2-D density contours layered on top.
basket <- nba_players
ggplot(data = basket, aes(x = AGE, y = PTS)) +
  geom_point(aes(color = PTS), alpha = 1) +
  geom_density_2d() +
  ggtitle('NBA Player Age to Points Made')
options(scipen=999)
#this turns the scientific notation off
# NOTE(review): options() here and theme_set() below change global session
# state, affecting every later plot, not just this one.
theme_set(theme_bw())
#^ this helps get a good theme for this graph
data("midwest", package = "ggplot2")
gg <- ggplot(midwest, aes(x=area, y=poptotal)) +
geom_point(aes(col=state, size=popdensity)) +
geom_smooth(method="loess", se=F) + #this gets us the line of best fit
xlim(c(0, 0.1)) +
ylim(c(0, 500000)) +
labs(y="Population",
x="Area",
title="Area Vs Population",
caption = "Source: midwest")
# xlim()/ylim() drop out-of-range rows, which produces the warnings below.
plot(gg)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 15 rows containing non-finite values (stat_smooth).
## Warning: Removed 15 rows containing missing values (geom_point).
In order to use an RColorBrewer palette in a barplot in ggplot2, you have to first save your barplot in a variable, and then append the function scale_fill_brewer() to it, with the argument palette =“palettename” inside the function. Don’t forget to add code setting “fill” equal to your x variables (i.e. your bars), otherwise you’ll get a plain grey bar graph. Done correctly, code should look like this:
# Bar chart of the dummy data filled with the RColorBrewer "Pastel1"
# palette. geom_col() is exactly geom_bar(stat = "identity").
ggplot(data = dummy, aes(x = colno, y = colcount, fill = colno)) +
  geom_col() +
  labs(x = "ColorNumber", y = "ColorCount", title = "Color Dummy Data") +
  scale_fill_brewer(palette = "Pastel1")
Base R plots require a bit of finessing to get just right.
First of all, I plan to place the legend outside the plot itself so I must first change the margins. This is done with par() which can set graphical parameters. Within par(), mar takes a vector input of 4 numbers determining how far into the pane the plot margins should be, starting at the bottom and continuing clockwise. xpd takes a logical value & setting it to TRUE tells R not to clip the legend or any other insertion outside the plot bounds. inset=c(-0.35,0) tells R to move the legend away from the plot, outside the margin horizontally. To make the legend look a bit cleaner, use bty='n' to remove the box around the legend.
To set the point size in base plot, use cex=data$variable. In this case cex=mtcars$disp. For the legend, use pt.cex=c(n1, n2, n3...nx) for however many legend items you have. I’m choosing intervals of 100 from 100 to 400.
# Widen the right margin to leave room for a legend outside the plot
# region; xpd = T lets drawing (the legend) escape the plot bounds.
par(mar=c(4, 5, 2, 8), xpd = T) # bottom, left, top, right
# First attempt: cex taken straight from displacement — the points will be
# enormous; this is deliberately fixed step by step below.
plot(mtcars$mpg, mtcars$hp, cex = mtcars$disp,
pch=16,
main = 'Miles per Gallon vs Gross Horsepower with Displacement',
xlab = 'miles per gallon',
ylab = 'gross horspower')
# inset moves the legend out of the plot area; bty='n' drops the legend box
legend('right', inset=c(-0.35,0),
legend=c('100', '200', '300', '400'),
title = 'Displacement\n(cu. in.)',
pch=16, ncol=1, pt.cex = c(100, 200, 300, 400),
bty = 'n')
Yikes, it seems like the points may be a bit too large. Let's make them all smaller by dividing their values by 100, taking care not to forget about the legend points as well by changing the values for pt.cex.
par(mar=c(4, 5, 2, 8), xpd = T) # bottom, left, top, right
# Second attempt: divide displacement by 100 so the points fit the plot;
# note cex still scales the point RADIUS, fixed in the next iteration.
plot(mtcars$mpg, mtcars$hp, cex = mtcars$disp/100,
pch=16,
main = 'Miles per Gallon vs Gross Horsepower with Displacement',
xlab = 'miles per gallon',
ylab = 'gross horspower')
# Legend point sizes shrunk by the same factor as the data points
legend('right', inset=c(-0.35,0),
legend=c('100', '200', '300', '400'),
title = 'Displacement\n(cu. in.)',
pch=16, ncol=1, pt.cex = c(100, 200, 300, 400)/100,
bty = 'n')
Better, but the points don’t seem to be proportional. It seems that cex scales points by altering the radius of a point. This would cause the area of each consecutive point to be exponentially larger than the last when using the formula \(A=πr^2\). To correct this and make the point size proportional to the area, rearrange the formula to solve for the radius that produces the desired area. \(r=\sqrt{\frac{A}{π}}\)
Now the points are proportional in area but still too large, so I divide the areas by 5.
par(mar=c(4, 5, 2, 8), xpd = TRUE) # bottom, left, top, right
# Scale point size so the point AREA (not radius) is proportional to
# displacement: r = sqrt(A / pi), then divide by 5 to bring the sizes into
# a reasonable range. Use the exact constant pi rather than the 3.14
# approximation (also fixes the "horspower" axis-label typo).
plot(mtcars$mpg, mtcars$hp, cex = sqrt(mtcars$disp/pi)/5,
pch=16,
main = 'Miles per Gallon vs Gross Horsepower with Displacement',
xlab = 'miles per gallon',
ylab = 'gross horsepower')
# Legend points use the same area-based scaling as the data points
legend('right', inset=c(-0.35,0),
legend=c('100', '200', '300', '400'),
title = 'Displacement\n(cu. in.)',
pch=16, ncol=1, pt.cex = sqrt(c(100, 200, 300, 400)/pi)/5,
bty = 'n')
Now it looks right!
When I need to make a plot, I usually prefer ggplot() over base r for its simplicity. As you can see, I was able to make a clean graph with just 5 lines of code. To show displacement as a variable point size, you set size=disp within the aesthetics of ggplot. Ggplot automatically scales by area & makes the points reasonable sizes as well. Because mtcars is set as the data source, there is no need to reference the data frame for each variable.
# Same chart in ggplot2: size=disp maps displacement to point AREA
# automatically, so no manual sqrt() scaling is needed.
ggplot(mtcars, aes(x=mpg, y=hp, size=disp))+
geom_point()+
ggtitle('Miles per Gallon vs Gross Horsepower with Displacement')+
labs(size = 'Displacement\n(cu. in.)', x='miles per gallon', y='gross horsepower')+
theme_classic()
# Load the built-in swiss data (1888 fertility/socioeconomic indicators)
data(swiss)
str(swiss)
## 'data.frame': 47 obs. of 6 variables:
## $ Fertility : num 80.2 83.1 92.5 85.8 76.9 76.1 83.8 92.4 82.4 82.9 ...
## $ Agriculture : num 17 45.1 39.7 36.5 43.5 35.3 70.2 67.8 53.3 45.2 ...
## $ Examination : int 15 6 5 12 17 9 16 14 12 16 ...
## $ Education : int 12 9 5 7 15 7 7 8 7 13 ...
## $ Catholic : num 9.96 84.84 93.4 33.77 5.16 ...
## $ Infant.Mortality: num 22.2 22.2 20.2 20.3 20.6 26.6 23.6 24.9 21 24.4 ...
# Data preparation and manipulation
swiss$municipality <- rownames(swiss) # create new column for municipality names
swiss$Fertility_z <- round((swiss$Fertility - mean(swiss$Fertility))/sd(swiss$Fertility), 2) # z-score: each municipality's distance from the national mean fertility, in SDs
swiss$Fertility_type <- ifelse(swiss$Fertility_z < 0, "below", "above") # above/below-average flag; drives the bar colors in the plot below
swiss <- swiss[order(swiss$Fertility_z), ] # sort by z-score
swiss$municipality <- factor(swiss$municipality, levels = swiss$municipality) # factor levels lock in the sorted order so ggplot keeps it
# Plotting the diverging barplot: bars extend left/right of zero by
# z-score, colored by whether the municipality is above or below average.
ggplot(swiss, aes(x=municipality, y=Fertility_z, label=Fertility_z)) +
geom_bar(stat='identity', aes(fill=Fertility_type), width=.5) +
theme(text = element_text(size=9)) +
scale_fill_manual(name="Fertility",
labels = c("Above Average", "Below Average"),
values = c("above"="#00ba38", "below"="#f8766d")) +
labs(title= "Divergence in the Average Fertility in Swiss Municipalities",
x = "Municipality",
y = 'Fertility') +
coord_flip()
## scale_fill_manual() gives explicit control of the legend: because
## Fertility_type is mapped to fill, we can pick exact colors per level
## instead of ggplot's default evenly-spaced hues.
To make a lollipop chart with ggplot2 you can use geom_point() for the circle and geom_segment() for the stem.
# Lollipop chart data: 10 students (A-J) with random weekly study hours.
# Seed the RNG so the example is reproducible across knits (previously the
# chart changed on every run).
set.seed(42)
let <- data.frame(
x=LETTERS[1:10],
y=runif(10, 2, 9)
)
# geom_segment() draws the stems, geom_point() the circles on top
ggplot(let, aes(x=x, y=y)) +
geom_segment( aes(x=x, xend=x, y=0, yend=y), color="black") +
geom_point( size=4, color="blue",
fill=alpha("purple", 0.2), alpha=.8, shape=21, stroke=4) +
theme_gray() +
theme(
panel.grid.major.x = element_blank(),
panel.border = element_blank(),
axis.ticks.x = element_blank()
) +
xlab("Student") +
ylab("Hours Spent Studying") +
ggtitle("Student Hours of Study (per Week)") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_y_continuous(limits = c(0, 10))
Use the state data that comes with R. Let’s look at state life expectancy. First, prepare the data that we will use for a couple examples.
# Combine the built-in state.x77 matrix with region/division factors
state_df <- data.frame(state.x77, Region=state.region, Division=state.division)
state_df <- rownames_to_column(state_df, var="State")
# Z-score of each state's life expectancy relative to the state mean
state_df$Life.Exp.Z <-
(state_df$Life.Exp - mean(state_df$Life.Exp))/sd(state_df$Life.Exp)
state_df$Above.Avg <- ifelse(state_df$Life.Exp.Z >= 0, "yes", "no")
# Sort by z-score and freeze that order via factor levels for plotting
state_df <- state_df[order(state_df$Life.Exp.Z), ]
state_df$State <- factor(state_df$State, levels = state_df$State)
Next create a simple diverging lollipop chart to highlight the difference of each state’s mean life expectancy from the mean state mean life expectancy.
# Diverging lollipop: a dot per state, with a segment running from the mean
# life expectancy to the state's value; coord_flip() puts states vertically.
ggplot(state_df, aes(x=State, y=Life.Exp)) +
geom_point(stat='identity', fill="black", size=2) +
geom_segment(aes(y = mean(Life.Exp),
x = State,
yend = Life.Exp,
xend = State),
color = "black") +
labs(title="State Life Expectancy",
subtitle="Deviation from Mean State Life Expectancy",
y = "Life Expectancy (Years)",
x = NULL) +
scale_y_continuous(breaks = seq(68, 74, 1)) +
coord_flip()
Now let’s try to view a subset. With a smaller amount of data we can add the life expectancies as labels on the dots.
# Southern states only: dots at the z-score, labeled with the actual life
# expectancy; color marks above/below average and a rotated annotation
# labels the zero (mean) line.
ggplot(filter(state_df, Region == "South"), aes(x=State, y=Life.Exp.Z, label=round(Life.Exp, 1))) +
geom_hline(yintercept = 0, color = "darkgray") +
geom_point(stat='identity', aes(col=Above.Avg), size=6) +
geom_text(color="white", size=2.2) +
labs(y = "Standard Deviations from Mean State Life Expectancy") +
geom_text(aes(x = "West Virginia",
y = -0.05,
label = "Mean State Life Expectancy"),
angle = 90,
color = "darkgray") +
labs(title="Life Expectancy in the Southern States",
subtitle="Compared to Average State Life Expectancy") +
coord_flip() +
theme_bw() +
theme(axis.title.y=element_blank()) +
theme(legend.position = "none")
Using titanic dataset.
# Count passengers in each ticket class
slices <- table(titanic$Pclass)
class_names <- c("1st Class", "2nd Class", "3rd Class")
# Percentage of passengers per class, appended to each label ("1st Class 26%")
pct <- round(slices/sum(slices)*100)
lbls <- paste0(class_names, " ", pct, "%")
pie(slices, labels = lbls, main = "Passenger Class Distribution", col = rainbow(length(lbls)))
#ClassCount <- as.data.frame(table(nums$Pclass))
# NOTE(review): the counts below are hard-coded because reading them from
# the csv failed — confirm they still match the titanic data.
ClassCount <- data.frame(
Class = c("1st", "2nd", "3rd"),
n = c(186, 173, 355),
prop = c(26.1, 24.2, 49.7)
)
ClassCount <- ClassCount %>%
arrange(desc(Class)) %>%
mutate(lab.ypos = cumsum(prop) -0.5*prop)
## cumsum(prop) - 0.5*prop places each label at the center of its slice
mycols <- rainbow(length(ClassCount$Class))
# Pie chart = one stacked bar wrapped into polar coordinates
ggplot(ClassCount, aes(x = "", y = prop, fill = Class)) +
geom_bar(width = 1, stat = "identity", color = "black") +
coord_polar("y", start = 0) +
geom_text(aes(y = lab.ypos, label = prop), color = "black") +
scale_fill_manual(values = mycols) + theme_void()
# Labels and enrollment values for the pie chart
lab <- c("Pitt", "CMU", "Chatham", "Duquesne", "Carlow", "Point Park")
values <- c(34934, 13965, 2317, 9586, 2254, 3682)
# density draws hatched line fills; angle sets each slice's hatch angle
colpal = brewer.pal(6, "Set3")
pie(values, labels=lab, main = "Pittsburgh Schools' Student Population", density = 10,
angle=c(90, 45, 60, 30, 75, 20), col = colpal)
# Creating the dataframe. Use "=" (not "<-") inside data.frame(): with
# "<-" the call leaks `lab` and `val` into the global environment and gives
# the columns mangled names, so the aes() below only worked by accident via
# those leaked globals.
data <- data.frame(
lab = c("Pitt", "CMU", "Chatham", "Duquesne", "Carlow", "Point Park"),
val = c(34934, 13965, 2317, 9586, 2254, 3682)
)
# Creating the pie chart: one stacked bar mapped onto polar coordinates
s <-ggplot(data, aes(x="", y=val, fill=lab)) +
geom_bar(stat="identity", width=1) +
coord_polar("y", start=0) +
# Removing grid from the background
theme_void() +
ggtitle("Pittsburgh Schools' Student Population")
# One manual hex color per school
s + scale_fill_manual(values=c("#FFFF66", "#9900CC", "#000000", "#FF3300", "#0000CC", "#66CC33"))
Create a choropleth using color to show population density of European countries. First, download geojson data using the geojson_read() function from the geojsonio package. what = "sp" instructs the function to return the data as a SpatialPolygonsDataFrame type. This type contains vertices data that can be used to draw polygons, as well as additional information associated with the polygon. In this case the polygons are European countries and the associated data includes population and area data from 2005.
We are using the data from “https://raw.githubusercontent.com/leakyMirror/map-of-europe/master/GeoJSON/europe.geojson” stored under spdf
# Confirm the object type returned by geojson_read(what = "sp")
class(spdf)
## [1] "SpatialPolygonsDataFrame"
## attr(,"package")
## [1] "sp"
# S4 object: @data holds the per-country attribute table. Add a row-number
# id so it can be joined back to the tidied polygon vertices later.
spdf_data <- spdf@data %>%
mutate(id = row_number())
To use the geospatial data with ggplot we first need to transform it into a data frame using the tidy() function of the broom package. The strtoi() function here is converting the id field in the resulting dataframe to an integer, so that it is the same type as the id field in spdf_data.
# Flatten the polygons into a plottable data frame; strtoi() converts id to
# integer so it matches spdf_data$id for the join below.
spdf_tibble <- broom::tidy(spdf) %>%
mutate(id = strtoi(id))
Merge the geospatial data with the data about the polygons (countries).
# Merge: attach each country's attributes to every one of its vertex rows
spdf_tibble <- spdf_tibble %>%
  left_join(spdf_data, by = "id")
Plot the countries, with color determined by the country’s population.
# Choropleth: country polygons filled by 2005 population; coord_map() keeps
# a sensible aspect ratio and theme_void() strips axes and background.
ggplot() +
geom_polygon(data = spdf_tibble,
aes(fill = POP2005,
x = long,
y = lat,
group = group), size=0, alpha=0.9) +
theme_void() +
coord_map()
Do the same, but use the color palette from the viridis package.
# Same map with the perceptually-uniform viridis fill palette
ggplot() +
geom_polygon(data = spdf_tibble,
aes(fill = POP2005,
x = long,
y = lat,
group = group )) +
scale_fill_viridis() +
labs(title = "Population") +
theme_void() +
coord_map()
# plotmath demo: Greek letters and math typeset in titles and axis labels
x <- seq(-4, 4, len = 101)
y <- cbind(sin(x), cos(x))
matplot(x, y, type = "l", xaxt = "n",
main = expression(paste(plain(sin) * phi, " and ",
plain(cos) * phi)),
ylab = expression("sin" * phi, "cos" * phi), # only 1st is taken
xlab = expression(paste("Phase Angle ", phi)),
col.main = "blue")
# Custom x axis labeled at multiples of pi/2 using math symbols
axis(1, at = c(-pi, -pi/2, 0, pi/2, pi),
labels = expression(-pi, -pi/2, 0, pi/2, pi))
## How to combine "math" and numeric variables :
plot(1:10, type="n", xlab="", ylab="", main = "plot math & numbers")
# bquote() splices the current value of theta into the plotted expression
theta <- 1.23 ; mtext(bquote(hat(theta) == .(theta)))
# substitute() inserts each (i, i+1) pair into the math annotation
for(i in 2:9)
text(i,i+1, substitute(list(xi,eta) == group("(",list(x,y),")"),
list(x=i, y=i+1)))
## note that both of these use calls rather than expressions.
# Annotate a plot with typeset formulas; beneath each formula the literal
# expression() source that produced it is printed for reference
plot(1:10, 1:10)
text(4, 9, expression(hat(beta) == (X^t * X)^{-1} * X^t * y))
text(4, 8.4, "expression(hat(beta) == (X^t * X)^{-1} * X^t * y)",
cex = .8)
text(4, 7, expression(bar(x) == sum(frac(x[i], n), i==1, n)))
text(4, 6.4, "expression(bar(x) == sum(frac(x[i], n), i==1, n))",
cex = .8)
# The normal density, combining a fraction, sqrt, and an exponent
text(8, 5, expression(paste(frac(1, sigma*sqrt(2*pi)), " ",
plain(e)^{frac(-(x-mu)^2, 2*sigma^2)})),
cex = 1.2)
Using Phone data L
# Keep the country column plus the 1995-2005 subscription columns.
# (The original select() had a trailing comma after X2005.)
L1 <- L %>%
  select(country, X1995, X1996, X1997, X1998, X1999, X2000,
         X2001, X2002, X2003, X2004, X2005)
# Restrict to a small set of countries so the heatmap stays readable;
# %in% replaces the long chain of country=='...' | country=='...'.
L2 <- L1 %>%
  filter(country %in% c('Argentina', 'Brazil', 'Chile', 'Colombia',
                        'Portugal', 'Barbados', 'Mexico', 'Spain', 'Egypt'))
# Strip the leading "X" that read.csv added to the year columns —
# sub() is vectorized, so no per-column loop is needed.
colnames(L2) <- sub('X', '', colnames(L2))
# Wide -> long: one row per country/year with the subscription value
L3 <- melt(L2)
plot <- ggplot(L3, aes(variable, country, fill = value)) + geom_tile()
plot1 <- plot + xlab('Year') +
  ylab('Country') +
  ggtitle('Mobile Cell Phone Subs 1995-2005') +
  labs(fill = 'Per capita subscriptions')
plot1 + scale_fill_distiller(palette = 'Spectral')
# The URL used to query the USGS database; parameters select the data type
# (00060 = discharge), the site number, and the date range.
URL <- 'https://waterdata.usgs.gov/usa/nwis/dv?cb_00060=on&format=rdb&site_no=03049500&referred_module=sw&period=&begin_date=1970-01-01&end_date=2020-12-31'
# The feed carries comment lines plus a column-type row, so it needs a
# little cleanup after reading.
d <- read.delim(URL, comment.char = '#', stringsAsFactors = FALSE)
d <- d[-1, ]                        # drop the column-type row
d$datetime <- as.POSIXct(d$datetime)
colnames(d)[4] <- 'discharge'
d$discharge <- as.numeric(d$discharge)
# Month and year variables for aggregation
d$month <- month(d$datetime)
d$year <- year(d$datetime)
# Mean discharge for every month/year combination
aggDischarge <- aggregate(discharge~month+year, d, mean)
# Plotted by raw month number, Jan sits at the bottom and Dec at the top;
# reversing the axis scale complicates the labels, so flip the month values
# themselves instead (1 <-> 12, 2 <-> 11, ...).
aggDischarge$plotMonth <- 13 - aggDischarge$month
ggplot(aggDischarge, aes(x = year, y = plotMonth, fill = discharge)) +
  geom_tile() +
  scale_fill_viridis(discrete = FALSE) +
  scale_y_continuous(limits = c(0, 13), # a limit of 12 clips the top row of tiles; 13 keeps them all
                     breaks = seq(1, 12, 1), # one break per month so none are skipped
                     labels = c('Dec', 'Nov', 'Oct', 'Sep', 'Aug', 'Jul',
                                'Jun', 'May', 'Apr', 'Mar', 'Feb', 'Jan'),
                     expand = c(0, 0)) + # trim empty space above the x axis and below the title
  labs(x = 'year', y = 'month', fill = 'Mean Daily\nDischarge\ncu. ft/sec',
       title = 'Mean Daily Discharge of the Allegheny River at Natrona, PA') +
  theme_classic()
Resource used: https://jozef.io/r400-nhlapi-positional-data/
# Game ids for the 2017-18 regular season
gameIds <- 2017020001:2017021271
# Fetch all game feeds in parallel over half the available cores
cl <- parallel::makeCluster(parallel::detectCores() / 2)
gameFeeds <- parallel::parLapplyLB(cl, gameIds, nhlapi::nhl_games_feed)
parallel::stopCluster(cl)
# Cache the (large) result locally so the download is not repeated
saveRDS(gameFeeds, file.path("~", "gamefeeds_regular_2017.rds"))
# Retrieve the data frame of plays from a single game feed. A malformed
# feed yields an empty data frame instead of aborting the whole pipeline.
getPlaysDf <- function(gm) {
  res <- try(gm[[1L]][["liveData"]][["plays"]][["allPlays"]])
  if (inherits(res, "try-error")) {
    data.frame()
  } else {
    res
  }
}
plays <- lapply(gameFeeds, getPlaysDf)
# Bind the list into a single data frame
# NOTE(review): util_rbindlist is an unexported nhlapi helper (:::) and may
# break on a package upgrade.
plays <- nhlapi:::util_rbindlist(plays)
# Keep only the records that have coordinates
plays <- plays[!is.na(plays$coordinates.x), ]
# Shift coordinates so the minimum is zero before plotting
plays$coordx <- plays$coordinates.x + abs(min(plays$coordinates.x))
plays$coordy <- plays$coordinates.y + abs(min(plays$coordinates.y))
# Goal events only
goals <- subset(plays, result.event == "Goal")
# Reference the columns directly inside aes(): `goals$coordx` bypasses the
# data argument (it breaks faceting and was the source of the recorded
# "Use of `goals$coordx` is discouraged" warnings, which no longer occur).
ggplot(goals, aes(x = coordx, y = coordy, colour = "Black" )) +
labs(title = "Where are goals scored from on a rink") +
geom_point(alpha = 0.1, size = 0.2) +
xlim(0, 198) + ylim(0, 84) +
geom_density_2d_filled(alpha = 0.35, show.legend = FALSE) +
theme_void()
Resources used https://www.youtube.com/watch?v=GVWSGr4OLj8&list=PLhUoWCH9bWbxje8xauO7wagHzYJclj9cx&index=7
# Two toy scatterplots, arranged side by side with gridExtra below
x = c(1, 2, 3, 4, 5, 6, 7)
y = c(8, 9, 10, 11, 12, 13, 14)
data1 = data.frame(x, y)
pointex <- ggplot(data1, aes(x = x, y = y)) +
geom_point() +
labs(x = "examplelength",
y = "examplewidth")
pointex
nums = c(3, 8, 4, 5, 0.2, 0.5, 20)
int = c(0, 3, 9, 5, 1.5, 15, 5.3)
data2 = data.frame(nums, int)
pointex2 <- ggplot(data2, aes(x = nums, y= int)) +
geom_point()+
labs(x="miles",
y="hours")
pointex2
# grid.arrange() takes a list of grobs plus a column count and shared title
plotex <- list(pointex, pointex2)
grid.arrange(grobs = plotex, ncol = 2, top= "Main Title")
# Read a small fish-count table from an inline string (rows = observations
# 1 and 2, columns = species).
fishy <- read.table(text = "Butterfish Cockle Swordfish Tuna
1 59 49 18 24
2 32 24 71 31", header = TRUE)
# Stacked barplot: one bar per species; blue = row 1, purple = row 2
fish_counts <- as.matrix(fishy)
barplot(fish_counts, col = c("blue", "purple"))
fish <- read.csv('data/Pescado.csv')
names(fish)
## [1] "X" "Species" "Sex" "Total" "X.1" "X.2"
# Stacked bars: total per species, segments split (and colored) by sex
ggplot(fish, aes(y=Total, x=Species, fill=Sex))+
geom_bar(stat='identity')
Using spd dataset
# Bit of reformatting: strip the " (United States)" suffix from the state
# names. (The original ran this identical mutate twice; the gsub is
# idempotent, so once is enough.)
spd@data = spd@data %>%
mutate(google_name = gsub(" \\(United States\\)", "", google_name))
# Show it
plot(spd)
# Flatten the hexagon polygons to a data frame keyed by state name
spd_fortified <- tidy(spd, region = "google_name")
# Calculate the centroid of each hexagon to add the label:
centers <- cbind.data.frame(data.frame(gCentroid(spd, byid=TRUE), id=spd@data$iso3166_2))
# Now I can plot this shape easily as described before:
ggplot() +
geom_polygon(data = spd_fortified, aes( x = long, y = lat, group = group), fill="skyblue", color="white") +
geom_text(data=centers, aes(x=x, y=y, label=id)) +
theme_void() +
coord_map()
Using hockey dataset
# Join the hockey search data onto the hex map by state name
spd_fortified <- spd_fortified %>%
left_join(. , hockey, by=c("id"="State"))
# Make a first choropleth map; the log-transformed fill scale keeps a few
# high-search states from washing out the rest
ggplot() +
geom_polygon(data = spd_fortified, aes(fill = Searches, x = long, y = lat, group = group)) +
geom_text(data=centers, aes(x=x, y=y, label=id)) +
scale_fill_gradient(trans = "log") +
theme_void() +
coord_map() +
ggtitle("Google Search Popularity for Hockey by State")
### I used a package called “ggsoccer”, which can draw a soccer pitch for you. You can also build the pitch from scratch (as I did before discovering ggsoccer), but that takes a lot of time, and the length of the code invites mistakes.
# Two ggsoccer helpers draw a complete, correctly-proportioned pitch
ggplot() +
annotate_pitch() +
theme_pitch()
### this is how you can build a soccer pitch from scratch
# Hand-built pitch: layered rectangles, curves, and a circle placed at
# standard pitch dimensions; the final theme() hides all axes/background.
soccer_ns <- ggplot()+
geom_rect(aes(xmin = -10, xmax = 100, ymin= -10, ymax = 130), fill = "#669933", colour = "#FFFFFF", size = 0.5)+ ### Background outline
geom_rect(aes(xmin = 0, xmax = 90, ymin= 0, ymax = 120), fill = "#669933", colour = "#FFFFFF", size = 0.5)+ ###Sideline, Endline
geom_rect(aes(xmin = 0, xmax = 90, ymin= 0, ymax = 60), fill = NA , colour = "#FFFFFF", size = 0.5)+ #Halfway
geom_rect(aes(xmin = 24.85, xmax = 65.15, ymin= 0, ymax = 16.5), fill = "#669933", colour = "#FFFFFF", size = 0.5)+ #lower 18 yard box
geom_rect(aes(xmin = 35.85, xmax = 54.15, ymin= 0, ymax = 5.5), fill = "#669933", colour = "#FFFFFF", size = 0.5)+ # Lower 5 yard box
geom_rect(aes(xmin = 24.85, xmax = 65.15, ymin= 103.5, ymax = 120), fill = "#669933", colour = "#FFFFFF", size = 0.5)+ #upper 18 yard box
geom_rect(aes(xmin = 35.85, xmax = 54.15, ymin= 114.5, ymax = 120), fill = "#669933", colour = "#FFFFFF", size = 0.5)+ #upper 5 yard box
geom_curve(aes(x = 35.85, y = 16.5, xend = 54.15, yend = 16.5), curvature = -0.6, colour = "#FFFFFF")+ #lower D
geom_curve(aes(x = 35.85, y = 103.5, xend = 54.15, yend = 103.5), curvature = .6, colour = "#FFFFFF")+ #upper D
ggforce::geom_circle(aes(x0=45, y0=60, r= 9.15), colour = "#FFFFFF")+ ##Halfway circle
theme(rect = element_blank(),
line = element_blank(),
text = element_blank())
### By default the pitch is white and grey, so we need to change the pitch color from white to green and the lines from grey to white
# Recolor the ggsoccer pitch: white lines on a green fill, with a matching
# green panel behind the pitch
ggplot() +
annotate_pitch(colour = "white",
fill = "#3ab54a") +
theme_pitch() +
theme(panel.background = element_rect(fill = "#3ab54a"))
### And here I created a shot map: shots on target are shown in one fill color and off-target shots in the other.
# Data: 20 simulated shots (set.seed keeps the example reproducible)
set.seed(1)
df <- data.frame(x = rnorm(20, 80, 10),
y = rnorm(20, 50, 20),
# One label per shot. (The original drew 40 labels against
# 20 coordinates, silently recycling x/y and duplicating
# every point.)
Shot = sample(c("In", "Out"),
20, replace = TRUE))
ggplot(df) +
annotate_pitch(colour = "white",
fill = "#3ab54a") +
geom_point(aes(x = x, y = y, fill = Shot),
shape = 21,
size = 4) +
coord_cartesian(xlim = c(45, 105))+
theme_pitch() +
theme(panel.background = element_rect(fill = "#3ab54a"))
# Test data for the doughnut: four fruits with response counts
fruit_survey <- data.frame(
  fruit = c("Apple", "Banana", "Honeydew", "Watermelon"),
  count = c(15, 25, 20, 40)
)
# Each slice's share of the total
shares <- fruit_survey$count / sum(fruit_survey$count)
fruit_survey$fraction <- shares
# Cumulative shares give the top edge of each ring segment...
fruit_survey$ymax <- cumsum(shares)
# ...and each segment starts where the previous one ended
fruit_survey$ymin <- c(0, head(fruit_survey$ymax, n = -1))
# Rectangles wrapped into polar coordinates; xlim(c(2, 4)) opens the hole
ggplot(fruit_survey, aes(ymax = ymax, ymin = ymin, xmax = 4, xmin = 3, fill = fruit)) +
  geom_rect() +
  coord_polar(theta = "y") +
  xlim(c(2, 4))
It is possible to build a doughnut chart in base R; however, it requires a very long ‘doughnut’ function. The function is included below, and the additional code to create an actual chart. The example is taken from https://www.r-graph-gallery.com/130-ring-or-donut-chart.html. It is suggested to just use ggplot whenever you want to create this kind of chart.
# doughnut(): base-R ring ("donut") chart, adapted from graphics::pie().
#   x                numeric vector of non-negative slice sizes
#   labels           slice labels (default: names(x))
#   edges            polygon resolution for the full circle
#   outer.radius /
#   inner.radius     control the ring thickness
#   clockwise /
#   init.angle       drawing direction and starting angle (degrees)
#   density, angle,
#   col, border, lty per-slice graphical parameters (recycled to length(x))
#   main             plot title; ... is passed on to text() and title()
# NOTE(review): with the defaults lty = NULL / density = NULL, the
# rep(..., length.out = nx) calls emit the "'x' is NULL" warnings seen
# below — harmless but noisy.
doughnut <-
function (x, labels = names(x), edges = 200, outer.radius = 0.8,
inner.radius=0.6, clockwise = FALSE,
init.angle = if (clockwise) 90 else 0, density = NULL,
angle = 45, col = NULL, border = FALSE, lty = NULL,
main = NULL, ...)
{
if (!is.numeric(x) || any(is.na(x) | x < 0))
stop("'x' values must be positive.")
if (is.null(labels))
labels <- as.character(seq_along(x))
else labels <- as.graphicsAnnot(labels)
# Convert slice sizes to cumulative fractions of the circle
x <- c(0, cumsum(x)/sum(x))
dx <- diff(x)
nx <- length(dx)
plot.new()
# Square up the plot window so the ring is drawn round
pin <- par("pin")
xlim <- ylim <- c(-1, 1)
if (pin[1L] > pin[2L])
xlim <- (pin[1L]/pin[2L]) * xlim
else ylim <- (pin[2L]/pin[1L]) * ylim
plot.window(xlim, ylim, "", asp = 1)
# Default colors: the palette, or the foreground color when hatching
if (is.null(col))
col <- if (is.null(density))
palette()
else par("fg")
# Recycle per-slice graphical parameters to one value per slice
col <- rep(col, length.out = nx)
border <- rep(border, length.out = nx)
lty <- rep(lty, length.out = nx)
angle <- rep(angle, length.out = nx)
density <- rep(density, length.out = nx)
twopi <- if (clockwise)
-2 * pi
else 2 * pi
# Map a circle fraction t to (x, y) at the given radius
t2xy <- function(t, radius) {
t2p <- twopi * t + init.angle * pi/180
list(x = radius * cos(t2p),
y = radius * sin(t2p))
}
# Draw each slice as a filled wedge, with a tick line and label outside
for (i in 1L:nx) {
n <- max(2, floor(edges * dx[i]))
P <- t2xy(seq.int(x[i], x[i + 1], length.out = n),
outer.radius)
polygon(c(P$x, 0), c(P$y, 0), density = density[i],
angle = angle[i], border = border[i],
col = col[i], lty = lty[i])
Pout <- t2xy(mean(x[i + 0:1]), outer.radius)
lab <- as.character(labels[i])
if (!is.na(lab) && nzchar(lab)) {
lines(c(1, 1.05) * Pout$x, c(1, 1.05) * Pout$y)
text(1.1 * Pout$x, 1.1 * Pout$y, labels[i],
xpd = TRUE, adj = ifelse(Pout$x < 0, 1, 0),
...)
}
## Add white disc (covers the wedge centers to create the ring hole)
Pin <- t2xy(seq.int(0, 1, length.out = n*nx),
inner.radius)
polygon(Pin$x, Pin$y, density = density[i],
angle = angle[i], border = border[i],
col = "white", lty = lty[i])
}
title(main = main, ...)
invisible(NULL)
}
# inner.radius controls the width of the ring
doughnut( c(3,5,9,12) , inner.radius=0.5, col=c(rgb(0.2,0.2,0.4,0.5), rgb(0.8,0.2,0.4,0.5), rgb(0.2,0.9,0.4,0.4) , rgb(0.0,0.9,0.8,0.4)) )
## Warning in rep(lty, length.out = nx): 'x' is NULL so the result will be NULL
## Warning in rep(density, length.out = nx): 'x' is NULL so the result will be NULL
After researching doughnut charts on several sites, I learned that they, along with pie charts are not very well liked by professionals. It was mentioned that readers have a hard time with reading angles, so charts like bar charts and histograms are more easily understood.
I’ve been on something of a boxing kick for the past weeks, so I thought I’d put together a grouped barplot comparing the win-loss records of Muhammad Ali, Joe Frazier, George Foreman, and Mike Tyson.
# Win/loss records for four heavyweight boxers, built row-wise:
# one column per boxer, rows = Wins and Losses.
wins <- c(56, 32, 76, 50)
losses <- c(5, 4, 5, 6)
records <- rbind(Wins = wins, Losses = losses)
colnames(records) <- c("Muhammad Ali", "Joe Frazier", "George Foreman", "Mike Tyson")
# Grouped (beside = TRUE) bars: green = wins, red = losses
barplot(records, col = c("green", "red"), xlab = "Boxers", ylab = "Score", main = "The Four Heavenly Kings of Boxing", beside = TRUE, legend = rownames(records))
This creates a double bar plot making such a comparison, with the green-colored bars noting the wins, and the red-colored bars noting the losses, with one “column” for each boxer. The Y axis denotes the “score”, as in, a numeric count of wins/losses.
# Parallel-coordinates plot of NBA per-game stats (columns 4-8): one
# semi-transparent line per player, colored by team; axes ordered by
# skewness on a shared raw min/max scale
ggparcoord(data = nba_players,
columns = 4:8,
groupColumn = 'TEAM',
scale = 'globalminmax',
#scale = 'center',
order = 'skewness',
alphaLines = .3)+
ggtitle('NBA AVG Distribution')
names(card)
## [1] "face" "suit" "value"
# Parallel coordinates over all three card columns, with overlaid boxplots
# and individual points shown on each axis
ggparcoord(data = card,
columns = 1:3,
alphaLines = .3,
boxplot = TRUE,
showPoints = TRUE)+
ggtitle('Playing Card Distribution')
Plot showing the distribution of points within each NBA team
## library loaded at the top (ggridges)
# Ridgeline plot: one density of points scored per team, stacked vertically
ggplot(nba_players, aes(x = PTS, y = TEAM)) +
geom_density_ridges() +
theme_ridges() +
theme(legend.position = "none")
## Picking joint bandwidth of 2.81
Using Age as the transition to see how many points and Games Played per player. The larger the Bubble, the more Field Goals Made. Color represents Wins
# Animated bubble chart: frames advance over player AGE; bubble size is
# field goals made and color is wins. "{frame_time}" in the title shows
# the current age for each frame.
a <- nba_players %>%
ggplot( aes(x = GP, y= PTS, size = FGM, color = W)) +
geom_point(alpha = 0.7) +
scale_x_log10() +
theme_bw() +
labs(title = "Age: {frame_time}", x = "Games Played", y = "Points") +
transition_time(AGE) +
ease_aes("bounce-in")
animate(plot = a)
## plotMath with random Data: Area V.S Cicumference
# 30 random radii; compute each circle's area and circumference, then plot
# one against the other with plotmath-formatted axis labels.
r <- rnorm(30)
Area <- (pi * (r)^2)
circumference <- (2 * pi * r)
plot(Area, circumference,
type = 'p',
main = ("Area V.S Circumference"),
#expression() is used to add symbols to graph ONLY
#expression(paste()) is used to add both text and symbol to the graph
xlab= expression(paste("Area = ", pi * r^2)),
ylab = expression(paste("Circumference = ", 2 * pi * r)),
)
## candlestick chart
### For the data I used real stock prices obtained with the “quantmod” package. I chose Facebook because, as you know, on 10/4/2021 all Facebook servers went down, so I used a candlestick chart to see how that issue affected Facebook in the market.
library("quantmod")
# Get the Facebook stock data: you supply the ticker symbol and the date
# range. Here: roughly three months of daily bars leading up to the
# 2021-10-04 outage. auto.assign = FALSE returns the xts object directly
# instead of assigning it to a variable named "FB".
Facebook<-getSymbols("FB",
from = "2021-7-1",
to = "2021-10-5",
periodicity = "daily",
auto.assign=FALSE)
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
# Candlestick chart of the retrieved daily OHLC series
chartSeries(Facebook, name="Facebook Chart")